In [ ]:
#!pip install deep_tabular_augmentation
import os
import numpy as np
import torch
from torch import nn
from torch import optim
from sklearn.preprocessing import StandardScaler
from functools import partial
from vpower.src.utils.auxiliary_functions import *
import matplotlib.pyplot as plt
In [ ]:
#!pip install sdv
In [ ]:
import sdv
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.lite import SingleTablePreset
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import CopulaGANSynthesizer
LOAD DATA
In [ ]:
# Relative paths of the directories that hold the CSV partitions.
data_dir_s = "../../../data/synthetic_data"  # partitions loaded under the "_s" key suffix
data_dir_r = "../../../data/real_data"  # partitions loaded under the "_r" key suffix
data_dir_da ="../../../data/augmented_data"  # NOTE(review): not referenced in the visible cells
# "RD" makes the loading cell use the real partitions only; any other value
# makes it concatenate the real and synthetic partitions.
data_tag="RD"
In [ ]:
# Load every data partition from both source directories into `data_all`.
# Keys are "<partition>_s" for files under data_dir_s and "<partition>_r"
# for files under data_dir_r.
import pandas as pd  # explicit import: `pd` was otherwise only reachable through the wildcard import

data_all = {}
# Directories are the outer loop so the keys are inserted in the same order
# as before: all "_s" partitions first, then all "_r" partitions.
for suffix, source_dir in (("_s", data_dir_s), ("_r", data_dir_r)):
    for partition in ("train", "dev_in", "dev_out"):
        df = load_data_and_set_index(
            filepath=os.path.join(source_dir, f"{partition}.csv"),
            index_column_name="time_id",
        )
        data_all[partition + suffix] = df

# Choose the working "train" / "dev_in" sets: the real partitions alone when
# data_tag is "RD", otherwise the real rows followed by the synthetic rows.
if data_tag == "RD":
    data_all["train"] = data_all["train_r"]
    data_all["dev_in"] = data_all["dev_in_r"]
else:
    data_all["train"] = pd.concat([data_all["train_r"], data_all["train_s"]])
    data_all["dev_in"] = pd.concat([data_all["dev_in_r"], data_all["dev_in_s"]])
In [ ]:
# Use the selected training partition as the table the synthesizers are fitted on.
real_data=data_all["train"]
# Trailing expression: the notebook displays the number of rows.
len(real_data)
Out[ ]:
530706
In [ ]:
## Metadata Detection
In [ ]:
# Start from an empty single-table metadata object and let SDV infer the
# column types from the real training table.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(real_data)

print('Auto detected data:\n')
# Trailing expression: the notebook renders the detected metadata.
metadata
Auto detected data:
Out[ ]:
{
"columns": {
"draft_aft_telegram": {
"sdtype": "numerical"
},
"draft_fore_telegram": {
"sdtype": "numerical"
},
"stw": {
"sdtype": "numerical"
},
"diff_speed_overground": {
"sdtype": "numerical"
},
"awind_vcomp_provider": {
"sdtype": "numerical"
},
"awind_ucomp_provider": {
"sdtype": "numerical"
},
"rcurrent_vcomp": {
"sdtype": "numerical"
},
"rcurrent_ucomp": {
"sdtype": "numerical"
},
"comb_wind_swell_wave_height": {
"sdtype": "numerical"
},
"timeSinceDryDock": {
"sdtype": "numerical"
},
"power": {
"sdtype": "numerical"
}
},
"METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}
In [ ]:
# GaussianCopulaSynthesizer: construct it from the detected metadata, fit it
# on the real training table, then save the fitted model to disk.
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(real_data)
synthesizer.save(filepath='my_GaussianCopulaSynthesizer.pkl')
/home/user/.local/lib/python3.8/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'stw'. Data will not be rounded. warnings.warn( /home/user/.local/lib/python3.8/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'diff_speed_overground'. Data will not be rounded. warnings.warn( /home/user/.local/lib/python3.8/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'awind_vcomp_provider'. Data will not be rounded. warnings.warn( /home/user/.local/lib/python3.8/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'awind_ucomp_provider'. Data will not be rounded. warnings.warn( /home/user/.local/lib/python3.8/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'rcurrent_vcomp'. Data will not be rounded. warnings.warn( /home/user/.local/lib/python3.8/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'rcurrent_ucomp'. Data will not be rounded. warnings.warn( /home/user/.local/lib/python3.8/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'comb_wind_swell_wave_height'. Data will not be rounded. warnings.warn(
In [ ]:
# CopulaGANSynthesizer: construct it from the detected metadata, fit it on the
# real training table, then save the fitted model to disk.
# This rebinds `synthesizer`, replacing whichever model it held before.
synthesizer = CopulaGANSynthesizer(metadata)
synthesizer.fit(real_data)
synthesizer.save(filepath='my_CopulaGANSynthesizer.pkl')
/home/user/.local/lib/python3.8/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'stw'. Data will not be rounded. /home/user/.local/lib/python3.8/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'diff_speed_overground'. Data will not be rounded. /home/user/.local/lib/python3.8/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'awind_vcomp_provider'. Data will not be rounded. /home/user/.local/lib/python3.8/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'awind_ucomp_provider'. Data will not be rounded. /home/user/.local/lib/python3.8/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'rcurrent_vcomp'. Data will not be rounded. /home/user/.local/lib/python3.8/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'rcurrent_ucomp'. Data will not be rounded. /home/user/.local/lib/python3.8/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'comb_wind_swell_wave_height'. Data will not be rounded.
In [ ]:
# To reuse a previously saved model instead of the one fitted in this session:
# synthesizer = CopulaGANSynthesizer.load(filepath='my_CopulaGANSynthesizer.pkl')

# Draw synthetic rows from the current synthesizer and report how many came back.
synthetic_data = synthesizer.sample(num_rows=25_000)
print(len(synthetic_data))
25000
In [ ]:
# Scatter plot of the synthetic sample: `stw` on x, `power` on y, with each
# point coloured by the value of `feature_under_study`.
feature_under_study = "awind_vcomp_provider"

fig, ax = plt.subplots(figsize=(6, 6))
points = ax.scatter(
    synthetic_data["stw"],
    synthetic_data["power"] / 1e3,  # scaled by 1e3 to match the "Power (MW)" axis label
    c=synthetic_data[feature_under_study],
    s=4,
    label="FAKE",
)
ax.legend(loc="upper left")
ax.set_xlabel("Speed (knots)")
ax.set_ylabel("Power (MW)")
# Optional fixed axis ranges:
# ax.set_xlim(4, 25)
# ax.set_ylim(-2, 45)
ax.grid()

cbar = fig.colorbar(points, ax=ax)
cbar.set_label(feature_under_study, rotation=90)
# Fix the colour scale limits of the scatter (and therefore of the colour bar).
points.set_clim(-15, 50)

plt.show()
plt.close(fig)
Evaluating real vs. synthetic data
In [ ]:
from sdv.evaluation.single_table import evaluate_quality

# Build the SDV quality report for the synthetic sample against the real table.
quality_report = evaluate_quality(real_data, synthetic_data, metadata)
Creating report: 100%|██████████| 4/4 [00:07<00:00, 1.83s/it]
Overall Quality Score: 95.3% Properties: Column Shapes: 93.78% Column Pair Trends: 96.83%
In [ ]:
# Trailing expression: display the report's visualization for the 'Column Shapes' property.
quality_report.get_visualization('Column Shapes')
In [ ]:
# Trailing expression: display the report's visualization for the 'Column Pair Trends' property.
quality_report.get_visualization('Column Pair Trends')
In [ ]:
from sdv.evaluation.single_table import get_column_plot

# Plot the 'power' column for the real and the synthetic tables.
fig = get_column_plot(
    real_data=real_data, synthetic_data=synthetic_data, metadata=metadata, column_name='power'
)
fig.show()
In [ ]:
from sdv.evaluation.single_table import get_column_plot

# Plot the 'stw' column for the real and the synthetic tables.
fig = get_column_plot(
    real_data=real_data, synthetic_data=synthetic_data, metadata=metadata, column_name='stw'
)
fig.show()
In [ ]:
from sdv.evaluation.single_table import get_column_pair_plot

# Plot the ('stw', 'power') column pair for the real and the synthetic tables.
fig = get_column_pair_plot(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    column_names=['stw', 'power'],
)
fig.show()